knitr::opts_chunk$set(echo = TRUE)
invisible(lapply(
  c("dplyr", "data.table", "matrixStats", "ggplot2", "reshape2",
    "ggrepel", "pROC", "stringr", "tibble", "viridis", "ggridges"),
  function(pkg) suppressPackageStartupMessages(library(pkg, character.only = TRUE))
))

Data prep

##########################################################################################
## LSHTM: Maria’s data that she used for the hvCpG paper can be accessed from ing-p5 here:
# /mnt/old_user_accounts/p3/maria/PhD/Data/datasets/
#   
# Her hvCpG scripts are here:
# /mnt/old_user_accounts/p3/maria/PhD/Projects/hvCpGs/Scripts/

## Load results of Maria for hvCpG
Marias_hvCpGs <- read.csv("~/2024_hvCpG/03_prepDatasetsMaria/Derakhshan2022_ST5_hvCpG.txt")

## Recreate Maria's datasets
folder1 <- normalizePath("/mnt/old_user_accounts/p3/maria/PhD/Data/datasets/GEO/BMIQ + 10 PCs + age + sex OUTLIERS REMOVED/")
rds_files1 <- list.files(path = folder1, pattern = "\\.RDS$", full.names = TRUE)

# Read all .rds files into a list and convert each to a matrix
rds_list_mat1 <- lapply(rds_files1, function(file) {as.matrix(readRDS(file))})
## Name the list elements by file names (without extensions)
names(rds_list_mat1) <- gsub("\\.RDS$", "", basename(rds_files1))

rds_list_mat2 <- readRDS("/mnt/old_user_accounts/p3/maria/PhD/Data/datasets/TCGA/TCGA_BMIQ_age_sex_PC_adjusted_OUTLIERS_REMOVED_round2.RDS")
rds_list_mat2 <- lapply(rds_list_mat2, as.matrix)

rds_list_mat <- c(rds_list_mat1, rds_list_mat2) ; rm(rds_list_mat1, rds_list_mat2)

# Keep only the array background (covered in 15 datasets out of 30)
all_cpgs <- unlist(lapply(rds_list_mat, rownames))
cpg_counts <- table(all_cpgs)
common_cpgs <- names(cpg_counts[cpg_counts >= 15])

rds_list_mat <- lapply(rds_list_mat, function(mat) {
  mat[rownames(mat) %in% common_cpgs, ]
})

## Load mQTL-matched controls
cistrans_GoDMC_hvCpG_matched_control <- 
  read.table("~/2024_hvCpG/03_prepDatasetsMaria/cistrans_GoDMC_hvCpG_matched_control.txt", header = T)

## Load cpgnames (all cpg covered after filtration)
cpg_names_all <- rhdf5::h5read("/home/alice/arraysh5/all_matrix_noscale.h5", "cpg_names")

## Check if I'm consistent
table(cpg_names_all %in% common_cpgs) ## PERFECT
## 
##   TRUE 
## 394240
sub_cistrans_GoDMC_hvCpG_matched_control <- cistrans_GoDMC_hvCpG_matched_control[
  cistrans_GoDMC_hvCpG_matched_control$hvCpG_name %in% cpg_names_all &
    cistrans_GoDMC_hvCpG_matched_control$controlCpG_name %in% cpg_names_all,]

## Histogram of samples in array data
df <- data.frame(n_samples = sapply(rds_list_mat, ncol))

# Plot with ggplot2
ggplot(df, aes(x = n_samples)) +
  geom_histogram(bins = 100, fill = "steelblue", color = "white") +
  theme_minimal(base_size = 14) +
  labs(
    title = "Distribution of number of samples per dataset",
    x = "Number of samples",
    y = "Count of datasets"
  )

rm(all_cpgs, cpg_counts,folder1, rds_files1, cistrans_GoDMC_hvCpG_matched_control)

Distribution of methylation values for array after transformation

# combine everything into one vector
# all_values <- unlist(rds_list_mat, use.names = FALSE)
# 
# # plot one histogram for all values
# hist(all_values, breaks = 100,
#      main = "Histogram of all values in rds_list_mat",
#      xlab = "Values")

# rm(all_values)

Reproduce Maria’s results

Maria’s paper: We defined an hvCpG in the following way:

  1. in 65% of datasets in which the CpG is covered (following quality control), it has methylation variance in the top 5% of all (non-removed) CpGs.
  2. is covered in at least 15 of the 30 datasets.
detecthvCpGCutoff <- function(rds_list_mat){
  ## Within each dataset, calculate the CpGs variance, and keep the top 5%
  top5pcvar <- lapply(rds_list_mat, function(mat) {
    # Step 1: Calculate row variances
    row_variances = rowVars(mat, na.rm = T)
    df = data.frame(var=row_variances, cpg=names(row_variances))
    # Step 2
    top = top_n(df, as.integer(0.05*nrow(df)), var) ## slightly different than quantile cause keep ties
    return(top)
  })
  
  ## CpGs in the top 5% variance:
  cpg_counts_top <- table(unlist(lapply(top5pcvar, rownames))) %>%
    data.frame() %>% dplyr::rename("cpgs"="Var1", "all_cpgs_top5pc"="Freq")
  
  ## all covered CpGs:
  cpg_counts <- table(unlist(lapply(rds_list_mat, rownames))) %>%
    data.frame() %>% dplyr::rename("cpgs"="Var1", "all_cpgs"="Freq")
  
  ## Both:
  cpg_counts_full <- merge(cpg_counts, cpg_counts_top, all.x = T)
  rm(cpg_counts, cpg_counts_top)
  
  ## in 65% of datasets in which the CpG is covered (following quality control), it has methylation variance in the top 5% of all (non-removed) CpGs:
  ## rounding, to mimick Maria's approach
  hvCpGs_repro_maria <- na.omit(cpg_counts_full[round(cpg_counts_full$all_cpgs_top5pc/cpg_counts_full$all_cpgs, 2) >= 0.65,])
  
  return(hvCpGs_repro_maria)
}

hvCpGs_repro_maria <- detecthvCpGCutoff(rds_list_mat = rds_list_mat)

Maria detected 4143 hvCpGs. I detected 4377 hvCpGs. We both have 4143 hvCpGs in common.

What if we had only 3 people per dataset?

set.seed(123)  # for reproducibility

# Keep 3 random columns per element
rds_list_mat_3ind <- lapply(rds_list_mat, function(df) {
  cols <- sample(ncol(df), 3)   # choose 3 random columns
  df[, cols, drop = FALSE]      # subset while keeping data.frame
})

hvCpGs_repro_maria_3ind <- detecthvCpGCutoff(rds_list_mat = rds_list_mat_3ind)

## Plot 

# install.packages("ggVennDiagram")
library(ggVennDiagram)

# List of items
x <- list("Full array" = hvCpGs_repro_maria$cpgs, 
          "Array 3ind/ds" = hvCpGs_repro_maria_3ind$cpg)

p <- ggVennDiagram(x, label_alpha = 0) +  scale_fill_gradient(low = "#F4FAFE", high = "#4981BF") 
p

# 2D Venn diagram
ggsave("../../figures/arrayCutoffLowPower3ind.pdf", plot = p, width = 5, height = 5)

Identify hvCpGs based on a sd multiplicative factor lambda

The proportion of CpGs than Maria found hvCpGs is p(hvCpG) = 1.02%.

Calculate lambda

Within each dataset k, calculate the median sd of all CpG j

all_sd_jk <- sapply(rds_list_mat, function(k){
  
  get_sd_k <- function(k){
    ## Calculate a vector of the row (=per CpG j) sd
    return(rowSds(k, na.rm = T))
  }
  ## Return a vector of sds, in a list for each dataset 
  return(get_sd_k(k))
})

## Plot:
df_long <- reshape2::melt(all_sd_jk, variable.name = "Vector", value.name = "SDs")

## Plot distributions of all vectors on the same graph
ggplot(df_long, aes(x = SDs, color = L1)) +
  geom_density() +
  labs(title = "Distribution of SDs accross datasets",
       x = "SDs",
       y = "Density") +
  theme_minimal() +
  theme(legend.position = "none") +
  scale_x_sqrt()

## Calculate lambda per dataset
lambdas = sapply(all_sd_jk, function(x){quantile(x, 0.95, na.rm=T)/median(x, na.rm=T)})
names(lambdas) <- gsub(".95%", "", names(lambdas))
# Histogram with kernel density
ggplot(data.frame(lambda=lambdas, dataset=names(lambdas)),
       aes(x = lambdas)) +
  geom_histogram(aes(y = after_stat(density)),
                 colour = 1, fill = "white", binwidth = .1) +
  geom_density(lwd = 1, colour = 4,
               fill = 4, alpha = 0.25)+
  theme_minimal() +
  geom_vline(xintercept = median(lambdas), col = "red")

median(lambdas)
## [1] 4.192186

Which datasets have a higher lambda, and why?

df = data.frame(nind=sapply(rds_list_mat, ncol),
                dataset=names(rds_list_mat),
                lambda=lambdas, 
                tissue = sapply(strsplit(names(rds_list_mat), "_"), `[`, 1),
                ethnicity=sapply(strsplit(names(rds_list_mat), "_"), `[`, 2)) %>%
  mutate(dataset=ifelse(dataset %in% c("Blood_Cauc", "Blood_Hisp"), "Blood_Cauc_Hisp", dataset)) %>% 
  mutate(dataset=ifelse(dataset %in% c("Blood_Mexican", "Blood_PuertoRican "), "Blood_Mex_PuertoRican ", dataset)) %>% 
  mutate(dataset=ifelse(dataset %in% c("CD4+_Estonian", "CD8+_Estonian"), "CD4+_CD8+_Estonian", dataset)) %>% 
  mutate(dataset=ifelse(dataset %in% c("Saliva_Hisp", "Saliva_Cauc"), "Saliva_Hisp_Cauc", dataset))

ggplot(data = df,
       aes(x=lambda, y=nind))+
  geom_smooth(method = "lm")+
  geom_point()+
  geom_label_repel(aes(label = dataset, fill=dataset), size= 2, alpha=.8, max.overlaps = 25)+
  theme_bw()+theme(legend.position = "none")+
  scale_x_log10()
## `geom_smooth()` using formula = 'y ~ x'

## Emphasize the outliers
df_long$col = "grey"
df_long$col[df_long$L1 %in% "uterus"] <- "red"

ggplot(df_long, aes(x = SDs, group = L1, fill = col)) +
  geom_density(data = df_long[!df_long$L1 %in% "uterus",], alpha = .5) +
  geom_density(data = df_long[df_long$L1 %in% "uterus",], alpha = .6) +
  scale_fill_manual(values = c("grey", "red"))+
  labs(title = "Distribution of SDs accross datasets",subtitle = "Red=uterus",
       x = "SDs",
       y = "Density") +
  theme_minimal() +
  theme(legend.position = "none") +
  scale_x_sqrt()

Maximum likelihood analysis

The equation is:

\[ \log\left(P(M_j)\right) = \sum_{i=1}^{n} \log\left( \sum_{Z_j=0}^{1} \left( \sum_{Z_{j,k}=0}^{1} P(M_{i,j} \mid Z_{j,k}) \times P(Z_{j,k} \mid Z_j) \right) \times P(Z_j) \right) \]

Examine weird values

The transformation lead to weird values:

test <- readRDS("/mnt/old_user_accounts/p3/maria/PhD/Data/datasets/GEO/Raw_cleaned_beta_matrices_GEO/Blood_Hisp")
test[rownames(test) %in% "cg23089912",5:10]
##            GSM1870975 GSM1870977 GSM1870979 GSM1870986 GSM1870989 GSM1870992
## cg23089912  0.8486523  0.8883389  0.8610275          0  0.8572614  0.8153023
test <- readRDS("/mnt/old_user_accounts/p3/maria/PhD/Data/datasets/GEO/BMIQ + 10 PCs + age + sex OUTLIERS REMOVED/Blood_Hisp.RDS")
test[rownames(test) %in% "cg23089912",5:10]
##             GSM1870975  GSM1870977 GSM1870979   GSM1870986  GSM1870989
## cg23089912 0.001379438 0.002837506  0.0086327 1.124546e-17 0.002056392
##              GSM1870992
## cg23089912 0.0001942933
# GSM1870986
# 0.00000000000000001124546 --> give extreme value in scaling, and p dnorm=0

The zero was weirdly transformed. Is that expected?

Check specific probe detected in atlas data

cpgs_of_interest <- c("cg27470047", "cg16958594", "cg01992382", "cg19313311")

library(matrixStats)

# Step 1: compute SDs per CpG per dataset
sds_per_dataset <- lapply(rds_list_mat, function(x) {
  subset_mat <- x[rownames(x) %in% cpgs_of_interest, , drop = FALSE]
  if (nrow(subset_mat) == 0) {
    return(NULL)
  }
  rowSds(as.matrix(subset_mat), na.rm = TRUE) |>
    setNames(rownames(subset_mat))
})

# Step 2: bind results into a data.frame
sds_df <- do.call(cbind, sds_per_dataset)
## Warning in (function (..., deparse.level = 1) : number of rows of result is not
## a multiple of vector length (arg 7)
# Step 3: median SD per CpG across datasets
median_sd_per_cpg <- apply(sds_df, 1, median, na.rm = TRUE)
median_sd_per_cpg = melt(median_sd_per_cpg) %>% rownames_to_column("CpG")

sds_df = melt(sds_df)
names(sds_df) = c("CpG", "dataset", "sd")

# Step 4: histogram of SD distributions per group
ggplot(sds_df, aes(x = sd)) +
  geom_histogram(bins = 50, fill = "steelblue", alpha = 0.7) +
  facet_wrap(~ CpG, scales = "free_y") +
  theme_minimal() +
  labs(title = "Distribution of SD per CpG",
       x = "Standard deviation across samples",
       y = "Count")+
  geom_vline(data = median_sd_per_cpg, aes(xintercept = value))

ROC curves to define p0 and p1 matching Maria’s approach

\[ p0 = P(Z_{j,k}=0 \mid Z_j=0) \\ p1 = P(Z_{j,k}=1 \mid Z_j=1) \]

p1 = 0.65 in Maria’s approach (to be a hvCpG, it needs to be hypervariable in >65% of the datasets)

## Load pos2check that was used in the grid run
load("/home/alice/2024_hvCpG/05_hvCpGalgorithm/resultsDir/Arrays/hvCpGandControls/results_MariasarraysREDUCED_allsamples_30datasets_6906CpGs_0_8p0_0_65p1.RData")

cpgNamesgridrun <- rownames(results_MariasarraysREDUCED_allsamples_30datasets_6906CpGs_0_8p0_0_65p1)

# Step 1: Read results and compute AUC + threshold
files <- list.files(
  "../../resultsDir/Arrays/hvCpGandControls",
  pattern = "^results_Maria_6906CpGs_.*\\.RData$",
  full.names = TRUE
)

auc_df <- tibble(p0=double(), p1=double(), AUC=double(), threshold=double())
roc_list <- list()
 
for (f in files) {
  if (file.info(f)$size == 0) next
  nm <- load(f)
  res <- get(nm)

  pstr <- str_match(f, "_(\\d+(?:_\\d+)?)p0_(\\d+(?:_\\d+)?)p1")[, 2:3]
  p0 <- as.numeric(gsub("_", ".", pstr[1]))
  p1 <- as.numeric(gsub("_", ".", pstr[2]))
  label <- sprintf("p0=%.2f p1=%.2f", p0, p1)

  df <- as.data.frame(res) %>% rownames_to_column("CpGpos") %>%
    mutate(CpG=cpgNamesgridrun)
  colnames(df)[2] <- "alpha"

  truth <- ifelse(df$CpG %in% Marias_hvCpGs$CpG, 1, 0)
  roc_obj <- try(roc(truth, df$alpha, quiet=TRUE), silent=TRUE)
  if (inherits(roc_obj, "try-error")) next

  aucv <- as.numeric(auc(roc_obj))
  thr <- coords(roc_obj, "best", ret = "threshold", transpose = FALSE)[["threshold"]]
  auc_df <- auc_df %>% add_row(p0, p1, AUC=aucv, threshold=thr)
  pts <- coords(roc_obj, "all", ret=c("sensitivity","specificity"),
                transpose = FALSE) %>%
    as_tibble() %>%
    mutate(FPR = 1 - specificity, TPR = sensitivity, label=label)
  roc_list[[label]] <- pts
}

roc_all <- bind_rows(roc_list)

# Step 2: Plot ROC curves
roc_plot <- ggplot(roc_all, aes(FPR, TPR, group=label, color=label)) +
  geom_line(linewidth=0.8, alpha=0.6) +
  geom_abline(slope=1, linetype="dashed", color="gray50") +
  labs(title="ROC Curves for Different (p0,p1)",
       x="False Positive Rate", y="True Positive Rate") +
  theme_minimal() + theme(legend.position="none")

# Step 3: AUC heatmap with thresholds

### To avoid multiple
auc_df <-   auc_df %>% distinct(p0, p1, .keep_all = TRUE)

best <- auc_df |>
  filter(AUC == max(AUC, na.rm = TRUE)) |>
  slice(1)  # in case of ties

heatmap_plot <- ggplot(auc_df, aes(p0, p1, fill=AUC)) +
  geom_tile(color="white") +
  # geom_text(aes(label=round(threshold,2)),
  #           size=3, color="black", check_overlap = TRUE) +
  # Outline best tile
  geom_rect(xmin = 0.8-0.025, xmax = 0.8+0.025, ymin = 0.65-0.025, ymax = 0.65+0.025,
            fill = NA, color = "red", linewidth = 1) +
  scale_fill_viridis_c(option = "plasma") +
  labs(title="AUC & Optimal Threshold per (p0,p1)",
       x="p0", y="p1") +
  theme_minimal(base_size = 14) + theme(plot.title = element_blank())
# Step 4: Show both plots
print(roc_plot)

print(heatmap_plot)

auc_df[auc_df$p0 %in% 0.8 & auc_df$p1 %in% 0.65,]
## # A tibble: 1 × 4
##      p0    p1   AUC threshold
##   <dbl> <dbl> <dbl>     <dbl>
## 1   0.8  0.65 0.986     0.735

The best threshold ploted in tiles is the alpha probability that gives the best sensitivity-specificity trade-off. It’s derived using a statistical rule (Youden’s J: maximizes (Sensitivity + Specificity – 1), giving the best overall balance between true positives and true negatives). It classifies a CpG as hypervariable in Maria’s approach if alpha > threshold

Conclusions:

Very big discrepancies between Maria’s approach and mine for high p1 with low p0.

Subset Maria’s data following Atlas data structure (less N/dataset) to check power of detection in Atlas data:

load("~/2024_hvCpG/05_hvCpGalgorithm/resultsDir/Arrays/hvCpGandControls/results_Maria_6906CpGs_0_8p0_0_65p1.RData")
df1 = results_Maria_pos2check_0_8p0_0_65p1

myplots <- function(df2, N){
  plot1 = data.frame(df1, df2) %>%
    tibble::rownames_to_column("CpGpos") %>% 
    mutate(CpGname =  cpg_names_all[as.numeric(CpGpos)]) %>% 
    mutate(ishvCpG = ifelse(CpGname %in% Marias_hvCpGs$CpG, "1500 hvCpG in Maria's study", "1500 mQTL matching controls")) %>%
    dplyr::rename(alpha_full = alpha, alpha_reduce = alpha.1) %>% 
    ggplot(aes(x = alpha_full, y = alpha_reduce)) +
    geom_point(aes(fill=ishvCpG), pch =21, size =3, alpha = .3) + 
    geom_abline(slope = 1, linetype = "dashed") + theme_bw()
  
  # === 1) Prepare the ORIGINAL ===
  df_original = df1 %>%
    data.frame() %>%
    tibble::rownames_to_column("CpGpos") %>%
    mutate(CpGname =  cpg_names_all[as.numeric(CpGpos)]) %>% 
    mutate(ishvCpG = ifelse(
      CpGname %in% Marias_hvCpGs$CpG,
      "1500 hvCpG in Maria's study",
      "1500 mQTL matching controls"
    ),
    source = "original") %>%
    melt()
  
  # === 2) Prepare the MIMIC ===
  df_mimic = df2 %>%
    data.frame() %>%
    tibble::rownames_to_column("CpGname") %>%
    mutate(ishvCpG = ifelse(
      CpGname %in% Marias_hvCpGs$CpG,
      "1500 hvCpG in Maria's study",
      "1500 mQTL matching controls"
    ),
    source = "mimicAtlas") %>%
    melt()
  
  # === 3) Combine ===
  df_combined = bind_rows(df_original, df_mimic)
  
  # === 4) Plot ===
  plot2 = ggplot(df_combined, aes(x = value, y = ishvCpG, fill = ..x..)) +
    geom_density_ridges_gradient(scale = 3, rel_min_height = .01) +
    facet_wrap(~source, ncol = 1) +
    scale_fill_viridis(name = "alpha", option = "C") +
    theme_bw() +
    theme(axis.title.y = element_blank()) +
    labs(title = "Probability alpha of being hypervariable in my algorithm",
         subtitle = paste0("Original array data vs reduced (3 samples in ", N, " datasets)"))
  
  print(plot1)
  print(plot2)
}

for (i in c(3,20)){
  load(paste0("~/2024_hvCpG/05_hvCpGalgorithm/resultsDir/Arrays/hvCpGandControls/results_MariasarraysREDUCED_3samples_", i, "datasets_6906CpGs_0_8p0_0_65p1.RData"))
  myplots(df2 = get(paste0("results_MariasarraysREDUCED_3samples_", i, "datasets_6906CpGs_0_8p0_0_65p1")), N=i)
}
## Using CpGpos, CpGname, ishvCpG, source as id variables
## Using CpGname, ishvCpG, source as id variables

## Warning: The dot-dot notation (`..x..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(x)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Picking joint bandwidth of 0.0435
## Picking joint bandwidth of 0.0891

## Using CpGpos, CpGname, ishvCpG, source as id variables
## Using CpGname, ishvCpG, source as id variables
## Warning: Removed 767 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Picking joint bandwidth of 0.0308
## Picking joint bandwidth of 0.0891

for (i in c(3,20)){
  load(paste0("~/2024_hvCpG/05_hvCpGalgorithm/resultsDir/Arrays/hvCpGandControls/results_MariasarraysREDUCED_4samples_", i, "datasets_6906CpGs_0_8p0_0_65p1.RData"))
  myplots(df2 = get(paste0("results_MariasarraysREDUCED_3samples_", i, "datasets_6906CpGs_0_8p0_0_65p1")), N=i)
}
## Using CpGpos, CpGname, ishvCpG, source as id variables
## Using CpGname, ishvCpG, source as id variables

## Picking joint bandwidth of 0.0435
## Picking joint bandwidth of 0.0891

## Using CpGpos, CpGname, ishvCpG, source as id variables
## Using CpGname, ishvCpG, source as id variables
## Warning: Removed 767 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Picking joint bandwidth of 0.0308
## Picking joint bandwidth of 0.0891

for (i in c(3,20)){
  load(paste0("~/2024_hvCpG/05_hvCpGalgorithm/resultsDir/Arrays/hvCpGandControls/results_MariasarraysREDUCED_5samples_", i, "datasets_6906CpGs_0_8p0_0_65p1.RData"))
  myplots(df2 = get(paste0("results_MariasarraysREDUCED_3samples_", i, "datasets_6906CpGs_0_8p0_0_65p1")), N=i)
}
## Using CpGpos, CpGname, ishvCpG, source as id variables
## Using CpGname, ishvCpG, source as id variables

## Picking joint bandwidth of 0.0435
## Picking joint bandwidth of 0.0891

## Using CpGpos, CpGname, ishvCpG, source as id variables
## Using CpGname, ishvCpG, source as id variables
## Warning: Removed 767 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Picking joint bandwidth of 0.0308
## Picking joint bandwidth of 0.0891

Calculate AUC the ROC curve

# Initialize result data frame
auc_results <- data.frame(NbrDataset = integer(), AUC = numeric(), NbrSamples = integer())

# Loop through number of samples
for (s in 3:5) {
  for (i in 3:30) {
    # Load result
    file_path <- paste0("~/2024_hvCpG/05_hvCpGalgorithm/resultsDir/Arrays/hvCpGandControls/results_MariasarraysREDUCED_", s, "samples_", i, "datasets_6906CpGs_0_8p0_0_65p1.RData")
    if (!file.exists(file_path)) next
    load(file_path)
    df2 <- get(paste0("results_MariasarraysREDUCED_", s, "samples_", i, "datasets_6906CpGs_0_8p0_0_65p1"))
    ## Which proportion of the CpG tested are covered?
    propCovered <- table(is.na(df2))[1]/nrow(df2)
    # Build mimic dataframe with label
    df_mimic <- df2 %>%
      data.frame() %>%
      tibble::rownames_to_column("CpGname") %>%
      mutate(ishvCpG = ifelse(CpGname %in% Marias_hvCpGs$CpG, 1, 0))
    # Identify alpha column
    score_col <- if ("alpha" %in% colnames(df_mimic)) "alpha" else colnames(df_mimic)[2]
    # Compute ROC and AUC
    roc_obj <- roc(response = df_mimic$ishvCpG, predictor = df_mimic[[score_col]])
    auc_val <- as.numeric(auc(roc_obj))
    # Store result
    auc_results <- bind_rows(auc_results, tibble(
      NbrDataset = i,
      AUC = auc_val,
      NbrSamples = s,
      propCovered = propCovered
    ))
  }
}
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## For full dataset
# Loop through number of samples
for (i in 3:30) {
  # Load result
  file_path <- paste0("~/2024_hvCpG/05_hvCpGalgorithm/resultsDir/Arrays/hvCpGandControls/results_MariasarraysREDUCED_allsamples_", 
                      i, "datasets_6906CpGs_0_8p0_0_65p1.RData")
  if (!file.exists(file_path)) next
  load(file_path)
  df2 <- get(paste0("results_MariasarraysREDUCED_allsamples_", i, "datasets_6906CpGs_0_8p0_0_65p1"))
  ## Which proportion of the CpG tested are covered?
  propCovered <- table(is.na(df2))[1]/nrow(df2)
  # Build mimic dataframe with label
  df_mimic <- df2 %>%
    data.frame() %>%
    tibble::rownames_to_column("CpGname") %>%
    mutate(ishvCpG = ifelse(CpGname %in% Marias_hvCpGs$CpG, 1, 0))
  # Identify alpha column
  score_col <- if ("alpha" %in% colnames(df_mimic)) "alpha" else colnames(df_mimic)[2]
  # Compute ROC and AUC
  roc_obj <- roc(response = df_mimic$ishvCpG, predictor = df_mimic[[score_col]])
  auc_val <- as.numeric(auc(roc_obj))
  # Store result
  auc_results <- bind_rows(auc_results, tibble(
    NbrDataset = i,
    AUC = auc_val,
    NbrSamples = 15,
    propCovered = propCovered
  ))
}
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_results$NbrSamples[auc_results$NbrSamples %in% 15] <- "all"

ggplot(auc_results, aes(x = NbrDataset, y = AUC, color = factor(NbrSamples), group = NbrSamples)) +
  geom_hline(linetype = 5, yintercept = 0.9) +
  geom_line(size = 1.1) +
  geom_point(data = auc_results, aes(size = propCovered)) +
  scale_color_brewer(palette = "Set1") +
  labs(
    x = "Number of datasets",
    y = "AUC",
    color = "Samples\nper dataset",
  ) +
  guides(
    color = guide_legend(order = 1),
    size = guide_legend(order = 2)
  ) +
  theme_minimal(base_size = 14) +
  theme(
    legend.position = c(0.95, 0.05),
    legend.justification = c("right", "bottom"),
    legend.box = "horizontal",
    legend.background = element_rect(fill = "white", color = "black", size = 0.4),
    legend.key = element_rect(fill = "white", color = NA),
    plot.margin = margin(5.5, 20, 5.5, 5.5, "pt"),
    panel.clip = "off"
  ) +
  coord_cartesian(clip = "off") +
  scale_x_continuous(breaks = 3:30)+
  scale_size(
    range = c(0.1, 7),
    breaks = c(0.2, 0.4, 0.6, 0.8),  # Adjust depending on your data
    name = "Proportion of\nCpGs covered"
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: A numeric `legend.position` argument in `theme()` was deprecated in ggplot2
## 3.5.0.
## ℹ Please use the `legend.position.inside` argument of `theme()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning in plot_theme(plot): The `panel.clip` theme element is not defined in
## the element hierarchy.

auc_results <- data.frame(NbrDataset = integer(), AUC = numeric(), NbrSamples = integer())

i = 16
# Load result
file_path <- paste0("~/2024_hvCpG/05_hvCpGalgorithm/resultsDir/Arrays/hvCpGandControls/results_MariasarraysREDUCED_allsamples_", 
                    i, "datasets_6906CpGs_0_8p0_0_65p1.RData")
if (!file.exists(file_path)) next
load(file_path)
df2 <- get(paste0("results_MariasarraysREDUCED_allsamples_", i, "datasets_6906CpGs_0_8p0_0_65p1"))

## Which proportion of the CpG tested are covered?
propCovered <- (table(is.na(df2))/(sum(table(is.na(df2)))))[[1]]

# Build mimic dataframe with label
df_mimic <- df2 %>%
  data.frame() %>%
  tibble::rownames_to_column("CpGname") %>%
  mutate(ishvCpG = ifelse(CpGname %in% Marias_hvCpGs$CpG, 1, 0))
# Identify alpha column
score_col <- if ("alpha" %in% colnames(df_mimic)) "alpha" else colnames(df_mimic)[2]
# Compute ROC and AUC
roc_obj <- roc(response = df_mimic$ishvCpG, predictor = df_mimic[[score_col]])
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_val <- as.numeric(auc(roc_obj))
# Store result
auc_results <- bind_rows(auc_results, tibble(
  NbrDataset = i,
  AUC = auc_val,
  NbrSamples = 15
))

auc_results
##   NbrDataset       AUC NbrSamples
## 1         16 0.9790176         15

Notes:

lambdas are defined based on a 5% threshold, even if alpha is not. How to not hardcode them? MCMC on lambda? Link with alpha?

The analysis is complete.